#Import packages
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
!pip install pandas-datareader
import pandas_datareader
import datetime
import pandas_datareader.data as web
import requests
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
!pip install TextBlob
from textblob import TextBlob
from pandas_datareader import data as pdr
!pip install yfinance
import yfinance
import scipy.stats as stats
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
nltk.download('stopwords')
!pip install wordcloud
from wordcloud import WordCloud
import re
#Use VADER
!pip install vadersentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import os
!pip install feedparser
import feedparser as fp
import json
!pip install newspaper3k
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime, timedelta
import time
import pprint
%matplotlib inline
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS,ImageColorGenerator
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from textblob import TextBlob
from PIL import Image
from os import path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import os
!pip install feedparser
import feedparser as fp
import json
!pip install newspaper3k
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime, timedelta
import time
import pprint
import nltk
nltk.download('vader_lexicon')
import json
import requests
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta
import glob
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
#This will be used when creating graphs to order the days and months
order_days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
order_month = ['January','February','March','April','May','June','July','August','September','October','November','December']
# Load FTSE 100 daily prices (investing.com CSV export) and inspect the raw data
ftse = pd.read_csv('FTSE 100 Historical Data.csv')
ftse.head()
ftse.info()
ftse.describe()
#use yfinance and pandas data reader to import the S&P500 index values
# BUG FIX: start/end were the malformed strings '2019, 01, 1' / '2019, 12, 31';
# ISO-8601 dates are parsed unambiguously by pandas-datareader.
SP500 = pdr.get_data_yahoo('^GSPC',
                           start='2019-01-01',
                           end='2019-12-31')
# Move the DatetimeIndex into a regular 'Date' column to match the CSV datasets
SP500.reset_index(inplace=True)
SP500.head()
SP500.describe()
#Import the exchange rate dataset for USD/GBP (comment previously said AUD/USD;
#the file read below is the USD_GBP export)
FXUSD = pd.read_csv('USD_GBP Historical Data.csv')
FXUSD.head()
FXUSD.describe()
Comments
FTSE 100 data and USD/GBP exchange rate was obtained from investing.com[9][11] and S&P500 information uses the yfinance and pandas datareader package[12][13].
# NOTE(review): API key is hard-coded in source — move it to an environment
# variable or config file before sharing this notebook.
guardian_api = '8ca75031-03ce-4640-80b3-9accc793d8e6'
ARTICLES_DIR = join('tempdata', 'newsarticles') #Create a temporary directory to save the news articles
makedirs(ARTICLES_DIR, exist_ok=True)
# Sample URL - obtained from guardian to search for news articles
#
# http://content.guardianapis.com/search?q=news
# &api-key=your-api-key-goes-here
API_ENDPOINT = 'http://content.guardianapis.com/search?q=news'
# Query parameters: a full calendar year of articles, newest first, all fields,
# 200 results per page (the Guardian API maximum page size).
my_params = {
'from-date': "2019-01-01",
'to-date': "2019-12-31",
'order-by': "newest",
'show-fields': 'all',
'page-size': 200,
'api-key': guardian_api
}
my_params
# Download one JSON file of articles per day; skip days already on disk so the
# job can be resumed without re-spending API quota.
start_date = date(2019, 1, 1)
end_date = date(2019, 12, 31)  # ONLY ALLOWED 5,000 API REQUESTS SO SPLIT INTO BATCHES
dayrange = range((end_date - start_date).days + 1)
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    fname = join(ARTICLES_DIR, datestr + '.json')
    if not exists(fname):
        # then let's download it
        print("Downloading", datestr)
        all_results = []
        my_params['from-date'] = datestr
        my_params['to-date'] = datestr
        current_page = 1
        total_pages = 1
        while current_page <= total_pages:
            print("...page", current_page)
            my_params['page'] = current_page
            # ROBUSTNESS: added a timeout (the original could hang forever) and
            # raise_for_status() so HTTP errors fail loudly instead of producing
            # a confusing JSON-decode error further down.
            resp = requests.get(API_ENDPOINT, my_params, timeout=30)
            resp.raise_for_status()
            data = resp.json()
            all_results.extend(data['response']['results'])
            # if there is more than one page
            current_page += 1
            total_pages = data['response']['pages']
        with open(fname, 'w') as f:
            print("Writing to", fname)
            # re-serialize it for pretty indentation
            f.write(json.dumps(all_results, indent=2)) #create separate json files for each day and save it in the tempdata/articles/ folder
Comments
The Guardian provides an API for requesting news articles from its website, limited to 5,000 API calls per day. The full range of 10/05/2005 to 10/05/2020 requires more than 5,000 calls, so the code was run over 3 days to avoid reaching the daily maximum. (Note the query in this notebook covers 01/01/2019 to 31/12/2019.)
#Find names of all files in the folder tempdata/newsarticles/ with the extension .json
pattern = 'tempdata/newsarticles/*.json'
json_files = glob.glob(pattern)
print(json_files)
# Sanity check: preview one file (note glob order is filesystem-dependent, not sorted)
json2 = pd.read_json(json_files[1])
json2
# Read every daily JSON file into a DataFrame and stack them into one dataset.
frames = []
#iterate over json_files
# BUG FIX: the original loop variable was named ``json``, shadowing the imported
# json module for the rest of the script; renamed to json_path.
for json_path in json_files:
    #read json into a dataframe called newsdata
    newsdata = pd.read_json(json_path)
    #append newsdata to frames
    frames.append(newsdata)
    print('appending...')
#Concatenate frames into a single dataframe called news articles
newsarticles = pd.concat(frames)
#Print the shape of newsarticles
print(newsarticles.shape)
print('completed framing')
newsarticles.head(50)
# Check if there are missing values in the dataset
newsarticles['id'].isna().sum()
newsarticles.shape[0]
print('There are',newsarticles.shape[0],'rows and',newsarticles.shape[1],'columns')
newsarticles['pillarName'].unique()
#Only keep values where the pillarName = 'News'
newsarticles2 = newsarticles[newsarticles['pillarName'] == 'News']
newsarticles2.shape
newsarticles2.head()
newsarticles2.info()
#Create a new dataframe which has the web publication date only
publicationdate = newsarticles2[['webPublicationDate']]
publicationdate.reset_index(drop = True, inplace = True)
publicationdate.head()
#Column fields has the relevant information required for this analysis
# Expands the dict held in the 'fields' column into one column per key;
# row order is preserved, so the positional (index) merge below is valid.
newsarticlesdf = newsarticles2['fields'].apply(pd.Series)
print(newsarticlesdf.shape)
#Reset index
newsarticlesdf.reset_index(drop = True, inplace = True)
newsarticlesdf.head()
newsarticlesdf.info()
#Merge the publicationdate df with the newsarticles df to obtain the publication date of the article.
newsarticlesdate = newsarticlesdf.merge(publicationdate,left_index = True, right_index = True)
newsarticlesdate.head()
#Keep necessary columns for this investigation
newsarticlesdf2 = newsarticlesdate[['headline','body','charCount','wordcount','lastModified','publication','webPublicationDate']]
newsarticlesdf2.head()
Comments
The web publication date has been taken from the original dataset and merged back in to obtain the original date of publication. The webPublicationDate field is present for all rows.
#Convert web publication date to date time (comment previously said lastmodified)
# NOTE(review): newsarticlesdf2 is a column-slice of another frame, so this
# assignment may raise a SettingWithCopyWarning — confirm a .copy() isn't needed.
newsarticlesdf2['PublicationDate'] = pd.to_datetime(newsarticlesdf2['webPublicationDate'],errors = 'coerce')
#Check for NA
newsarticlesdf2.isna().sum()
#Find the type of each of the columns
ftse.dtypes
SP500.dtypes
#Drop columns that are not needed in the analysis
ftse.drop(labels = ['High','Open','Low','Volume','Chg%'], axis = 'columns', inplace = True)
SP500.drop(labels = ['High','Low','Open','Volume','Adj Close'], axis = 'columns', inplace = True)
ftse.head()
SP500.head()
#Rename Price to Close to match the SP 500 dataset
ftse.columns = ['Date','Close']
#The close price on the FTSE dataset included commas so use replace the commas with '' and then conver to float64 as you can't convert it otherwise
ftse['Date'] = pd.to_datetime(ftse['Date'])
ftse['Close'] = ftse['Close'].replace(',','',regex=True).astype(np.float64)
ftse.head()
#Find the missing dates from the dataset
# Build a complete calendar-day range, reindex onto it so weekends/holidays
# appear as NaN rows, then forward-fill the close from the previous trading day.
ftsedates = pd.date_range(start = ftse.Date.min(), end = ftse.Date.max())
ftseNew = ftse.set_index('Date').reindex(ftsedates).rename_axis('Date').reset_index()
ftseNew.head()
#Fill the missing dates which have NA values with the previous date.
ftseNew['Close'].fillna(method='ffill', inplace=True)
ftseNew.head()
SP500.info()
#Find the missing dates from the dataset (same calendar-fill as for the FTSE above)
SP500dates = pd.date_range(start = SP500.Date.min(), end = SP500.Date.max())
SP500New = SP500.set_index('Date').reindex(SP500dates).rename_axis('Date').reset_index()
SP500New.head()
#Fill the missing dates which have NA values with the previous date.
SP500New['Close'].fillna(method='ffill', inplace=True)
SP500New.head()
#To do time series analysis, set the index to be the date column
ftseTime = ftseNew.set_index('Date')
ftseTime.head()
SP500Time = SP500New.set_index('Date')
SP500Time.head()
#Drop columns Open, High and Low columns as we are only interested with the price on the day
FXUSD.drop(labels = ['Open','High','Low'], axis = 'columns', inplace = True)
FXUSD.head()
FXUSD['Date'] = pd.to_datetime(FXUSD['Date'])
FXUSD.head()
# Same calendar-fill treatment as the indices: reindex to every day, forward-fill
FXUSDdates = pd.date_range(start = FXUSD.Date.min(), end = FXUSD.Date.max())
FXUSDNew = FXUSD.set_index('Date').reindex(FXUSDdates).rename_axis('Date').reset_index()
FXUSDNew['Price'].fillna(method='ffill', inplace=True)
FXUSDTime = FXUSDNew.set_index('Date')
FXUSDTime.head()
Comments
The indices do not move on weekends or public holidays in the respective country, as stock markets are closed. These dates are not included within the data, so we identify them and carry the price forward from the previous trading day. The same applies to the FX rates, where the FX markets are generally closed on Christmas Day and New Year's Day.
# Plot the closing prices for FTSE 100
ftseTime['Close'].plot(grid = True)
plt.ylabel('Close Price')
plt.title('FTSE 100 Close Price')
plt.show()
# Plot the closing prices for the S&P 500 (comment previously said DJIA)
SP500Time['Close'].plot(grid = True)
plt.ylabel('Close Price')
plt.title('S&P 500 Close Price')
plt.show()
#Plot both FTSE 100 and S&P500 on the same graph
ftseTime['Close'].plot(grid = True, label = "FTSE 100")
SP500Time['Close'].plot(grid = True , label = "S&P 500")
plt.ylabel('Close Price')
plt.title('FTSE 100 & S&P 500')
plt.legend()
plt.show()
# Report when each series hit its period high/low.
# BUG FIX: the original printed ftseNew.max()/.min(), which is the column-wise
# max/min of EVERY column — the "max Date" is simply the last calendar date,
# not the date of the price extreme. Use idxmax/idxmin on the price column.
print("FTSE 100 reached its high of", ftseNew['Close'].max(), "on", ftseNew.loc[ftseNew['Close'].idxmax(), 'Date'])
print("FTSE 100 reached its low of", ftseNew['Close'].min(), "on", ftseNew.loc[ftseNew['Close'].idxmin(), 'Date'])
print("S&P 500 reached its high of", SP500New['Close'].max(), "on", SP500New.loc[SP500New['Close'].idxmax(), 'Date'])
print("S&P 500 reached its low of", SP500New['Close'].min(), "on", SP500New.loc[SP500New['Close'].idxmin(), 'Date'])
# Plot the USD/GBP exchange rate (comment previously said DJIA)
FXUSDTime['Price'].plot(grid = True)
plt.ylabel('Price')
plt.title('USD/GBP')
plt.show()
print("USD/GBP exchange reached its high of", FXUSDNew['Price'].max(), "on", FXUSDNew.loc[FXUSDNew['Price'].idxmax(), 'Date'])
print("USD/GBP exchange reached its low of", FXUSDNew['Price'].min(), "on", FXUSDNew.loc[FXUSDNew['Price'].idxmin(), 'Date'])
Features to be applied to the news dataset:
# Split the full timestamp into separate date and time-of-day features
newsarticlesdf2['PublishDate'] = [d.date() for d in newsarticlesdf2['PublicationDate']]
newsarticlesdf2['PublishTime'] = [d.time() for d in newsarticlesdf2['PublicationDate']]
newsarticlesdf2.head()
#Find day of week from the article
newsarticlesdf2['PublishDate'] = pd.to_datetime(newsarticlesdf2['PublishDate'],errors = 'coerce')
newsarticlesdf2['month'] = newsarticlesdf2['PublishDate'].dt.month
newsarticlesdf2['Month full'] = newsarticlesdf2['PublishDate'].dt.strftime('%B')
newsarticlesdf2['year'] = newsarticlesdf2['PublishDate'].dt.year
newsarticlesdf2.head()
newsarticlesdf2.info()
# Count of headlines by publication month
plt.figure(figsize=(10,6))
sns.countplot(newsarticlesdf2['month'])
# BUG FIX: the chart counts headlines per MONTH — the title said "per year"
plt.title('Count of number of headlines published per month')
month = newsarticlesdf2['Month full']
year = newsarticlesdf2['year']
# Cross-tabulate month x year and order the month columns chronologically
monthbyyear = newsarticlesdf2.groupby([month,year]).size()
monthbyyear = monthbyyear.rename_axis(['Month','Year']).unstack('Month').reindex(columns = order_month)
monthbyyear
sns.heatmap(monthbyyear,cmap = 'coolwarm')
plt.title('Heatmap - count of number of headlines published per month and year')
Comments
There was a spike in the number of articles published in November, potentially due to the coronavirus pandemic, which started gathering pace around that time.
#Sort the df by publish date and reset the index
newsarticlesdf2.sort_values(by = 'PublishDate',inplace = True)
newsarticlesdf2.reset_index(drop = True, inplace = True)
newsarticlesdf2.head()
newsarticlesdf2.describe()
#WordCount and Character count from the Guardian API is for the body of the text.
#Create a new column for the character and word count of the headline
# Word count = number of space-separated tokens; character count = raw length
newsarticlesdf2['headline_text_count'] = newsarticlesdf2['headline'].apply(lambda x: len(str(x).split(" ")))
newsarticlesdf2['headline_char_count'] = newsarticlesdf2['headline'].str.len()
newsarticlesdf2.head()
newsarticlesdf2.describe()
plt.figure(figsize=(10,6))
sns.countplot(newsarticlesdf2['headline_text_count'])
plt.title('Count of number headlines per word count')
plt.figure(figsize=(15,8))
sns.countplot(newsarticlesdf2['headline_char_count'])
plt.title('Count of number of characters in a headline')
Comments
Headlines generally contain fewer words, to grab the reader's attention. The average headline is around 6 words or less, as can be seen in the graphs above [23].
# Headline fragments identifying picture galleries, quizzes, podcasts etc.
# NOTE: the joined pattern is treated as a REGEX by str.contains — safe here
# because none of the fragments contain regex metacharacters.
headlines_to_drop = ['Student News:','Newsdesk','in pictures','Picture desk','best photographs','world in pictures','photo highlights','Your photographs of',': 23032007','Student News','Daily Newsdesk','podcast:','StudentNews1:','News quiz:','Helen Boden:','Milling Around','52 weeks:','Underwater photography:','The Illustrated London News','Photo highlights','picture of the day','photo of the day','Eyewitness:','photographer of the year','Video:',' photographs of the day','video:','pictures of the day','Xan Brooks','Activate 2011:','Media Talk:','MediaTalk:','Mediatalk:','Media talk:','In pictures:','365 days:','366 days','The Guardian Essential Report','Mediatalk','MediaTalk','quiz:','tweets:','weekly:','Weekly:','quiz of']
newsarticlesdf3 = newsarticlesdf2[~newsarticlesdf2.headline.str.contains('|'.join(headlines_to_drop))]
newsarticlesdf3.shape
Comments
The headlines matching the phrases listed in headlines_to_drop are removed, as these articles are unlikely to have an impact on the stock market. To forecast the stock price more precisely using news articles that contain photos, image processing could be used to analyse the photos that were released.
#Remove time zone from the datetime column - can't export file where timezones are present - not supported by excel.
newsarticlesdf3['PublicationDate']= newsarticlesdf3['PublicationDate'].dt.tz_localize(None)
newsarticlesdf3.head()
# NOTE(review): this is an ALIAS, not a copy — mutations of newsarticlesdf4
# below also mutate newsarticlesdf3. Harmless here since df3 is not reused,
# but use .copy() if that changes.
newsarticlesdf4 = newsarticlesdf3
newsarticlesdf4.shape
newsarticlesdf4['wordcount'].unique()
#newsarticlesdf4.to_excel('newsarticlesdf4.xlsx')
#newsarticlesdf4 = pd.read_excel('newsarticlesdf4.xlsx')
newsarticlesdf4.shape
newsarticlesdf4.info()
# Force text columns to str so downstream .split()/.lower() never hit NaN floats
newsarticlesdf4['headline'] = newsarticlesdf4['headline'].astype('str')
newsarticlesdf4['body']=newsarticlesdf4['body'].astype('str')
from wordcloud import WordCloud, STOPWORDS,ImageColorGenerator
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from textblob import TextBlob
from PIL import Image
from os import path
#code adapted from Data Science Blog [24]
#remove <br> </br> <p> </p> <b> </b> - html tags which were present in the body
# IMPROVED: one regex replaces the original six chained .replace() calls
# (each chained call re-scanned every cell of the frame).
newsarticlesdf4 = newsarticlesdf4.replace(r'</?(?:br|b|p)>', '', regex=True)
newsarticlesdf4.head()
#Use newsarticlesdf4 which contains headlines and body text
#convert text to lowercase (the split/join also collapses runs of whitespace)
newsarticlesdf4['headline_clean'] = newsarticlesdf4['headline'].apply(lambda x: " ".join(w.lower() for w in x.split()))
newsarticlesdf4['body_clean'] = newsarticlesdf4['body'].apply(lambda x: " ".join(w.lower() for w in x.split()))
newsarticlesdf4.head()
#remove special characters from the text
# FIX: raw string for the regex (avoids invalid-escape warnings) and explicit
# regex=True, which pandas>=2 requires for pattern replacement.
newsarticlesdf4['headline_clean'] = newsarticlesdf4['headline_clean'].str.replace(r'[^\w\s]', '', regex=True)
newsarticlesdf4['body_clean'] = newsarticlesdf4['body_clean'].str.replace(r'[^\w\s]', '', regex=True)
newsarticlesdf4.head()
#Stopwords - remove stop words
stop = stopwords.words('english') #includes words such as I,my,we
#Extend the stop words to remove the below. These words will appear a lot in the headlines, which means
#we would not be able to see the more important words in the headlines
stop.extend(['news','new','from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come','media','photo','photography','bn'])
# PERF: membership tests against a list are O(len(stop)) per word; a set makes
# each lookup O(1) — same result, much faster over the whole corpus.
stop_set = set(stop)
newsarticlesdf4['headline_clean_stop'] = newsarticlesdf4['headline_clean'].apply(lambda x: " ".join(w for w in x.split() if w not in stop_set))
newsarticlesdf4['body_clean_stop'] = newsarticlesdf4['body_clean'].apply(lambda x: " ".join(w for w in x.split() if w not in stop_set))
newsarticlesdf4.head()
#Create word cloud
text = " ".join(headlines for headlines in newsarticlesdf4.headline)
# NOTE(review): len(text) is a CHARACTER count, not a word count, despite the message
print ("There are {} words in the combination of all headlines.".format(len(text)))
# NOTE(review): this rebinds the name `stopwords`, shadowing the nltk.corpus
# module imported earlier; later word-cloud cells rely on this list, so the
# name must not change.
stopwords = ['BBC','say','new','News','media','Today','will','ad','says','news','from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come','media','photo','photography','bn'] + list(STOPWORDS)
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
plt.figure(figsize = (15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Word Cloud of all headlines')
plt.show()
The time series plots of the FTSE 100 show four distinct periods, hence investigate the words presents for these periods:
# Slice the year into its four quarters (month ranges are inclusive on both ends).
newsJan = newsarticlesdf4[newsarticlesdf4['month'].between(1, 3)]
newsApr = newsarticlesdf4[newsarticlesdf4['month'].between(4, 6)]
newsJul = newsarticlesdf4[newsarticlesdf4['month'].between(7, 9)]
newsOct = newsarticlesdf4[newsarticlesdf4['month'].between(10, 12)]
#Create word clouds, one per quarter.
# IMPROVED: the original repeated the same nine-line cell four times; a single
# helper removes the copy-paste while producing the same prints and plots.
def _quarter_wordcloud(quarter_df, label):
    """Print the combined headline length and draw a word cloud for one quarter.

    NOTE(review): len(text) is a CHARACTER count, although the printed message
    says "words" — kept as-is to preserve the original output text.
    """
    text = " ".join(headline for headline in quarter_df.headline)
    print("There are {} words in the combination of all headlines from {}.".format(len(text), label))
    cloud = WordCloud(stopwords=stopwords, background_color="white", collocations=False).generate(text)
    plt.figure(figsize=(10, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title('Word Cloud for ' + label)
    plt.show()

_quarter_wordcloud(newsJan, 'January to March')
_quarter_wordcloud(newsApr, 'April to June')
_quarter_wordcloud(newsJul, 'July to September')
_quarter_wordcloud(newsOct, 'October to December')
Comments
There are four distinct periods when looking at the stock indices graphs
def detect_polarity(text):
    """Return the TextBlob polarity score (-1.0 to 1.0) for *text*."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

newsarticlesdf4['polarity_headline'] = newsarticlesdf4.headline_clean_stop.apply(detect_polarity)
newsarticlesdf4['polarity_body'] = newsarticlesdf4.body_clean_stop.apply(detect_polarity)
newsarticlesdf4.head()

def detect_subjectivity(text):
    """Return the TextBlob subjectivity score (0.0 to 1.0) for *text*."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

newsarticlesdf4['subjectivity_headline'] = newsarticlesdf4.headline_clean_stop.apply(detect_subjectivity)
newsarticlesdf4['subjectivity_body'] = newsarticlesdf4.body_clean_stop.apply(detect_subjectivity)
newsarticlesdf4.head()
# Histograms of the TextBlob scores.
# IMPROVED: the original repeated the same six-line cell four times; one helper
# produces identical plots.
def _score_hist(series, xlabel, title):
    """Plot a 20-bin histogram of a sentiment-score series."""
    plt.figure(figsize=(10, 6))
    plt.hist(series, 20, facecolor='blue', alpha=0.5)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.title(title)
    plt.show()

#Distribution of polarity of headline
_score_hist(newsarticlesdf4.polarity_headline, 'Polarity', 'Histogram of polarity')
#Distribution of subjectivity of headline
_score_hist(newsarticlesdf4.subjectivity_headline, 'Subjectivity', 'Histogram of Subjectivity of Headline')
#Distribution of polarity of body
_score_hist(newsarticlesdf4.polarity_body, 'Polarity', 'Histogram of polarity')
#Distribution of subjectivity of body
# BUG FIX: the original titled this plot 'Histogram of polarity' although it
# plots the BODY SUBJECTIVITY distribution.
_score_hist(newsarticlesdf4.subjectivity_body, 'Subjectivity', 'Histogram of Subjectivity of Body')
Comments
TextBlob does not show any meaningful results for the sentiment of the headline: a number of articles have been classified as neutral when they should not be. VADER will be used instead to calculate the sentiment of the headline. TextBlob shows some results for the sentiment of the body of the article; however, due to computational resources, the body of the article is not in the scope of this investigation.
#Import vader package
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
#apply vader sentiment to output columns neg,pos,neu and compound. Code adapted from [27]
sentiment = newsarticlesdf4['headline'].apply(lambda x: analyzer.polarity_scores(x))
# FIX: pass axis as a keyword — the positional form pd.concat([...], 1) is
# deprecated and removed in pandas 2.x.
newsarticlesdf4 = pd.concat([newsarticlesdf4, sentiment.apply(pd.Series)], axis=1)
newsarticlesdf4.head()
#Drop neg neu and pos column from the dataset, only interested in the Compound sentiment analysis score
newsarticlesdf4 = newsarticlesdf4.drop(['neg','neu','pos'],axis = 1)
newsarticlesdf4.head()
#Add column to distinguish if the sentiment is positive,negative or neutral
def sentiment_class(num):
    """Map a VADER compound score to 'Positive', 'Negative' or 'Neutral'.

    A score of exactly zero counts as neutral.
    """
    if num < 0:
        return 'Negative'
    if num > 0:
        return 'Positive'
    return 'Neutral'
# Classify every headline's compound score and plot the class counts per month
newsarticlesdf4['VaderSentiment'] = newsarticlesdf4['compound'].apply(sentiment_class)
newsarticlesdf4.head(1)
plt.figure(figsize=(10,6))
sns.countplot(newsarticlesdf4['month'],hue = newsarticlesdf4['VaderSentiment'])
# BUG FIX: the x-axis is the publication MONTH — the title said "per year"
plt.title('Number of headlines per month split by VaderSentiment')
# NOTE(review): the original comment said "from year 2015 onwards", but the
# filter below actually selects months October-December with negative sentiment
# — confirm which was intended.
#Create a dataset for October onwards with negative sentiment
newsOct = newsarticlesdf4.loc[(newsarticlesdf4['month']>= 10) &(newsarticlesdf4['VaderSentiment'] == 'Negative')]
newsOct.shape
#Create word cloud
textOct = " ".join(headlines for headlines in newsOct.headline_clean_stop)
wordcloudOct = WordCloud(stopwords=stopwords, background_color="white", collocations = False).generate(textOct)
plt.figure(figsize = (10,10))
plt.imshow(wordcloudOct, interpolation='bilinear')
plt.axis("off")
plt.title('Word Cloud for negative sentiment October to December')
plt.show()
Comments
Digging deeper into the negative-sentiment headlines for October to December, the words associated with these headlines are 'fire', 'attack', 'labour' and 'death', reflecting the uncertainty caused by events such as Brexit and emerging health crises during this period.
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
import spacy
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
# Enable logging for gensim - optional (ERROR level keeps the notebook quiet)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# Silence gensim/nltk deprecation noise (both a filter and a simplefilter are set)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
#stem words to the root of each word e.g. publishes becomes publish. Code adapted from StackOverFlor [29]
# PERF FIX: the original constructed a new PorterStemmer() for EVERY token;
# one shared instance produces identical stems far faster.
_porter = PorterStemmer()

def stem_word(text):
    """Return *text* with every whitespace-separated token Porter-stemmed."""
    tokens = text.split()
    stemmed_tokens = [_porter.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

newsarticlesdf4['headline_clean_stop_stem'] = newsarticlesdf4['headline_clean_stop'].apply(stem_word)
newsarticlesdf4.head()
#Column headline_clean_stop excludes stopwords and punctuation
#Create list of headline clean text
headline_text = newsarticlesdf4.headline_clean_stop_stem.values.tolist()
print(headline_text[:1])
#Code for LDA modelling has been adapted from DataSkunkWorks [30] and Machine Learning Plus [32]
#Tokenize words which is required by LDA
def doc_words(sentences):
    """Yield each sentence as a list of lowercase tokens (accents stripped via deacc)."""
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence),deacc = True))
headlinewords = list(doc_words(headline_text))
print(headlinewords[:1])
import gensim.corpora as corpora
# Map each token to an integer id, then represent each headline as bag-of-words
id2word = corpora.Dictionary(headlinewords)
# Create Corpus
corpus = [id2word.doc2bow(text) for text in headlinewords]
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=4,
per_word_topics=True)
# Compute Perplexity score (lower is better)
print('\nPerplexity: ', lda_model.log_perplexity(corpus))
# Compute Coherence Score (c_v; higher is better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=headlinewords, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
# NOTE(review): pyLDAvis.gensim was renamed pyLDAvis.gensim_models in newer
# releases — confirm the installed version still exposes this module.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis
Comments
From the visualisation above we see that the topics overlap considerably, even when setting the number of topics to 4. In addition, the coherence score of the model is quite low, indicating that this model may not be optimal.
Note
As this is an unsupervised algorithm, the topic classifications change each time it is run.
# Find the optimal number of topics for LDA using gensim
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # BUG FIX: the original passed the GLOBAL id2word here instead of the
        # `dictionary` parameter, silently ignoring any dictionary the caller
        # supplied.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
#Find optimal number of topics - TAKES LONG TO RUN
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=headlinewords, start=2, limit=60, step=6)
#Show graph of coherence against topic count
limit=60; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# BUG FIX: ("coherence_values") is just a parenthesised STRING, so matplotlib
# iterated it character-by-character as labels; wrap the label in a list.
plt.legend(["coherence_values"], loc='best')
plt.title('Finding the optimal number of topics')
plt.show()
# Print the coherence scores
for y, cv in zip(x, coherence_values):
    print("Num Topics =", y, " has Coherence Value of", round(cv, 4))
Comments
Coherence allows us to judge how good the topic model is. From the graph above, the coherence score is at its highest at 50 topics. We will change the LDA topic number to 50 below and visualize them.
# Rebuild the dictionary/corpus and refit LDA at the coherence-optimal topic count
id2word = corpora.Dictionary(headlinewords)
#Create Corpus
corpus = [id2word.doc2bow(text) for text in headlinewords]
#Build LDA model with 50 topics (chosen from the coherence search above)
lda_model_optimise = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=50,
per_word_topics=True)
from pprint import pprint
pprint(lda_model_optimise.print_topics())
doc_lda_optimise = lda_model_optimise[corpus]
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_optimise, corpus, id2word)
vis
Comments
50 topics shows the highest coherence score, suggesting that model is optimal. However, when applying 50 topics nearly all topics overlap. Looking at the four topics generated we can't easily give them a label, hence we can't generate topics reliably. Classifying topics for different years may yield better results.
# Explore the 4-topic model's output interactively
# NOTE: this rebinds model_list (previously the list of fitted models from the
# coherence search) to the topic/word summaries of lda_model.
model_list = lda_model.print_topics()
model_list
lda_model.show_topic(1)
lda_model[corpus]
lda_model[corpus[1]]
# Topic mixture for the first document: list of (topic_id, probability) pairs
test_topics = lda_model.get_document_topics(corpus[0])
test_topics
# Dominant topic = pair with the highest probability
test_topics_max = max(test_topics, key=lambda x: x[1])
test_topics_max
test_topics2 = pd.DataFrame(list(test_topics_max))
test_topics2 = test_topics2.transpose()
test_topics2.columns=['Topic','Percentage']
test_topics2
#Get the topics and their percentage contribution for the headline
get_document_topics = [lda_model.get_document_topics(item) for item in corpus]
document_topics_df = pd.DataFrame(list(get_document_topics))
document_topics_df.head()
document_topics_df.shape
#Find the maximum percentage contribution for the row to identify the dominant topic
test_max = [max(p,key=lambda x: x[1]) for p in get_document_topics]
test_max
#Apply the above to a dataframe and rename the columns to topic and percentage contribution
doc_topics = pd.DataFrame(test_max)
doc_topics.columns = ['Topic','Percentage_Contribution']
doc_topics
#Join back to original df newsarticlesdf4
newsarticlesdf5 = newsarticlesdf4.merge(doc_topics,left_index = True, right_index = True)
newsarticlesdf5.head()
#Output the words associated with each topic
x = lda_model.show_topics()
nwords = {}
for topic,word in x:
nwords[topic] = re.sub('[^A-Za-z ]+','',word)
nwords
#Apply above to a df to join to newsarticlesdf5
topic_words = pd.DataFrame(list(nwords.items()),columns=['Topic', 'Topic_Words'])
topic_words
#Join to newsarticlesdf5 to get topic words and sort by PublishDate
#(inner join on the shared Topic key attaches the word list to every headline)
newsarticlesdf6 = pd.merge(newsarticlesdf5,topic_words,left_on = 'Topic', right_on = 'Topic')
newsarticlesdf6.sort_values(by=['PublishDate'], inplace=True)
newsarticlesdf6.reset_index(drop = True, inplace = True)
newsarticlesdf6.head()
newsarticlesdf6.shape
#Run VADER sentiment scoring on the topic words.
#Fix: the analyzer is instantiated as `analyser` at the top of the file, so the
#original call to `analyzer.polarity_scores` raised a NameError.
topic_sentiment = newsarticlesdf6['Topic_Words'].apply(lambda x: analyser.polarity_scores(x))
#Fix: pandas >= 2.0 requires `axis` as a keyword argument to pd.concat;
#the positional form used originally now raises a TypeError.
newsarticlesdf7 = pd.concat([newsarticlesdf6,topic_sentiment.apply(pd.Series)],axis=1)
newsarticlesdf7.head()
newsarticlesdf7.info()
#NOTE(review): selecting ['PublishDate','compound'] and then renaming to THREE
#columns only works if newsarticlesdf6 already carried a headline 'compound'
#column (duplicate labels) — confirm upstream columns.
newsarticlesdf8 = newsarticlesdf7[['PublishDate','compound']]
newsarticlesdf8.columns = ['PublishDate','HeadlineCompound','TopicCompound']
newsarticlesdf8.reset_index()
newsarticlesdf8.head(20)
#Average the sentiment scores per publish date
newsarticlesdf9 = newsarticlesdf8.groupby('PublishDate').mean()
newsarticlesdf9.head()
Features to be added to the stock market data
#Join the S&P 500 dataset and the USD FX dataset to build a converted close price.
#NOTE(review): the original comment mentioned ASX200/AUD, but the code merges
#SP500Time with FXUSDTime and names the product column GBP — confirm intent.
SP500Conversion = pd.merge(SP500Time,FXUSDTime, on = 'Date')
SP500Conversion.columns = ['SP500_Close_USD','USD_Price','Chg%','Change %']
SP500Conversion['SP500_Close_GBP'] = SP500Conversion['SP500_Close_USD'] * SP500Conversion['USD_Price']
SP500Conversion.drop(labels = ['Chg%'], axis = 'columns', inplace = True)
SP500Conversion.head()
#Plot both indices on one set of axes for comparison
SP500Conversion['SP500_Close_GBP'].plot(grid = True, label = "S&P 500")
ftseTime['Close'].plot(grid = True , label = "FTSE 100")
plt.ylabel('Close Price')
plt.title('S&P 500 & FTSE 100')
plt.legend()
plt.show()
#Exponential Weighted Moving Average using 253 days, i.e. an exponentially weighted moving average over a year.
#The average number of trading days per calendar year is 253 days [33]
ftseTime['EWMA'] = ftseTime['Close'].ewm(span=253).mean()
ftseTime.head()
ftseTime[['Close','EWMA']].plot()
EWMA Strategy
#Signed distance of the close price from its one-year EWMA; the sign drives the strategy below
ftseTime['Difference'] = ftseTime['Close'] - ftseTime['EWMA']
ftseTime.head()
def trading_strat(num):
    """Translate a close-minus-EWMA difference into a position signal.

    Positive -> "Long", negative -> "Short", anything else (zero, and NaN,
    since NaN comparisons are False) -> "Hold".
    """
    if num < 0:
        return "Short"
    if num > 0:
        return "Long"
    return "Hold"
#Derive the Long/Short/Hold signal for every trading day
ftseTime['Trading_Strategy'] = ftseTime['Difference'].apply(trading_strat)
ftseTime.head()
#Create new index for ftseTime df
ftseTime1 = ftseTime.reset_index()
ftseTime1.head()
#Create new index for newsarticlesdf9 which contains sentiment analysis on headline and topic
newsarticlesdf10 = newsarticlesdf9.reset_index()
newsarticlesdf10.head()
#Left join keeps every trading day; days without news get NaN sentiment
ftseSentiment = pd.merge(ftseTime1,newsarticlesdf10,left_on = "Date", right_on = "PublishDate", how = "left")
ftseSentiment.head()
Comments
The initial strategy looks at the Exponential Weighted Moving Average over a calendar year to decide whether to buy, hold or sell the index. Incorporating the news headline sentiment will be looked into now.
The logic for the new trading strategy will be:
def strategy(s):
    """Combine the EWMA signal with headline sentiment into a trade action.

    Returns "Buy" when the EWMA signal is "Long" and headline sentiment is
    positive, "Sell" when it is "Short" and sentiment is negative, and
    "Hold" in every other case (NaN sentiment compares False, so it holds).
    """
    signal = s['Trading_Strategy']
    sentiment = s['HeadlineCompound']
    if signal == "Long" and sentiment > 0:
        return "Buy"
    if signal == "Short" and sentiment < 0:
        return "Sell"
    return "Hold"
#Apply the combined rule row-wise to derive the sentiment-aware action
ftseSentiment['Sentiment_Strategy'] = ftseSentiment.apply(strategy,axis = 1)
ftseSentiment.head()
ftseSentiment['Sentiment_Strategy'].unique()
#Make Date the index for the dataframe
ftseSentiment1 = ftseSentiment.set_index('Date')
ftseSentiment1.head()
#Normalize the close price and EWMA so the headline sentiment can be plotted on the same graph.
#Fix: StandardScaler is never imported at the top of the file, so the original
#line raised a NameError; scikit-learn is already used elsewhere in this script.
from sklearn.preprocessing import StandardScaler
#Fix: fit on ftseSentiment1 (the frame being written to) instead of the
#unindexed ftseSentiment — identical rows/order, but source and target now agree.
#NOTE(review): HeadlineCompound is NaN on no-news days from the left join;
#StandardScaler ignores NaN when fitting and preserves it in the output.
ftseSentiment1[['Close','EWMA','HeadlineCompound',]] = StandardScaler().fit_transform(ftseSentiment1[['Close','EWMA','HeadlineCompound']])
ftseSentiment1.head()
#Plot headlinecompound first to overlay the index values on top
ftseSentiment1[['HeadlineCompound','Close','EWMA']].plot(figsize = (20,15))
plt.title('FTSE 100 with 1 year EWMA and headline sentiment')
Comments
There is a lot of noise, as can be seen above, in the sentiment analysis of the news headlines. The lag between when news headlines are published and when they are incorporated into the price of the index can be seen. In mid 2019, a news headline had a high positive sentiment score and a few days later the stock price had increased. To improve on this, key words should be examined to assess their impact on the stock market.
#Apply the EWMA with a period of 1 year similar to the FTSE 100 dataframe
SP500Conversion['EWMA'] = SP500Conversion['SP500_Close_USD'].ewm(span=253).mean()
SP500Conversion.head()
SP500Conversion[['SP500_Close_USD','EWMA']].plot()
#Signed distance of the close price from its EWMA; the sign drives the trading signal
SP500Conversion['Difference'] = SP500Conversion['SP500_Close_USD'] - SP500Conversion['EWMA']
SP500Conversion.head()
def trading_strat(num):
    """Translate a close-minus-EWMA difference into a position signal.

    Positive -> "Long", negative -> "Short", anything else (zero, and NaN,
    since NaN comparisons are False) -> "Hold".
    """
    if num < 0:
        return "Short"
    if num > 0:
        return "Long"
    return "Hold"
#Derive the Long/Short/Hold signal for every trading day
SP500Conversion['Trading_Strategy'] = SP500Conversion['Difference'].apply(trading_strat)
SP500Conversion.head()
#Add sentiment scores to the stock data
# steps - reset index for both the index data and news articles df
#Join on date
#plot sentiment against the time series plot
#Create new index for the S&P 500 df
SP500Conversion1 = SP500Conversion.reset_index()
SP500Conversion1.head()
#Left join keeps every trading day; days without news get NaN sentiment
SP500Sentiment = pd.merge(SP500Conversion1,newsarticlesdf10,left_on = "Date", right_on = "PublishDate", how = "left")
SP500Sentiment.head()
#Make Date the index for the dataframe
SP500Sentiment1 = SP500Sentiment.set_index('Date')
SP500Sentiment1.head()
#Normalize the close price and EWMA so the headline sentiment can be plotted on the same graph.
#Fix: StandardScaler is never imported at the top of the file, so the original
#line raised a NameError; scikit-learn is already used elsewhere in this script.
from sklearn.preprocessing import StandardScaler
SP500Sentiment1[['SP500_Close_USD','EWMA','HeadlineCompound']] = StandardScaler().fit_transform(SP500Sentiment1[['SP500_Close_USD','EWMA','HeadlineCompound']])
SP500Sentiment1.head()
#Plot headlinecompound first to overlay the index values on top
SP500Sentiment1[['HeadlineCompound','SP500_Close_USD','EWMA']].plot(figsize = (20,15))
plt.title('S&P 500 with 1 year EWMA and headline sentiment')
Comments
There is a considerable amount of noise present in the news headline sentiment. However, there are some trends such as in June the stock price hit a low point and there was a negative news sentiment.
#Spearman correlation heatmaps between index data and headline sentiment.
#Fix: pandas >= 2.0 raises on non-numeric columns (Date, Trading_Strategy, ...)
#in DataFrame.corr unless numeric_only=True; older pandas dropped them silently,
#so this reproduces the original behaviour on current pandas.
plt.subplots(figsize = (10,6))
graph1 = sns.heatmap(ftseSentiment.corr(method = 'spearman', numeric_only = True),cmap = 'coolwarm',annot = True)
plt.title('Correlation Heatmap of FTSE 100 and Headline Sentiment')
plt.subplots(figsize = (10,6))
graph1 = sns.heatmap(SP500Sentiment.corr(method = 'spearman', numeric_only = True),cmap = 'coolwarm',annot = True)
plt.title('Correlation Heatmap of S&P 500 and Headline Sentiment')
Comments
Spearman correlation is used to measure the degree of association between two variables [34]. There is a weak negative correlation between the headline sentiment and closing price of the stock index for both FTSE 100 and S&P500. This may produce poor results when looking at how the stock price changes with news headlines.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
ftseSentiment.head()
#Format date to ordinal to run through linear regression model as linear regression model does not accept date as datetime format
import datetime as dt
#NOTE(review): PublishDate is NaT where the left join found no news, and NaT has
#no usable toordinal — confirm every Date matched a PublishDate upstream
ftseSentiment['DateOrdinal'] = ftseSentiment['Date'].map(dt.datetime.toordinal)
ftseSentiment['PublishDateOrdinal'] = ftseSentiment['PublishDate'].map(dt.datetime.toordinal)
ftseSentiment.head()
#Features: both date ordinals plus headline sentiment; target: FTSE close price
X = ftseSentiment[['DateOrdinal','PublishDateOrdinal','HeadlineCompound']]
y = ftseSentiment['Close']
X.head()
y.head()
#Split the dataframe into training and test sets. Code for MLR adapted from StackAbuse.com[35]
#Enter a value for random state for reproducibility
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 42)
#Create MLR
from sklearn.linear_model import LinearRegression
mlr_mod = LinearRegression()
#Fix: fillna(..., inplace=True) on a train_test_split slice triggers pandas'
#chained-assignment warning and is not guaranteed to write through; assigning
#the result back is the reliable form and yields the same filled values.
#NOTE(review): filling X_test with its own column means leaks test-set
#statistics — the orthodox choice is X_train.mean() for both; kept as-is here
#to preserve the original results.
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())
#Fit the model
mlr_mod.fit(X_train,y_train)
#make predictions on the test set
pred_price = mlr_mod.predict(X_test)
pred_price[1]
#compute RMSE and R2 statistics
test_set_rmse = (np.sqrt(mean_squared_error(y_test,pred_price)))
test_set_r2 = r2_score(y_test,pred_price)
print(test_set_rmse)
print(test_set_r2)
#Side-by-side actual vs predicted close prices, aligned on y_test's index
prediction_values = pd.DataFrame({'Actual':y_test,'Predicted':pred_price})
prediction_values.head()
X_test
y_test
#Merge predicted values to the X_test by index
predicted_test_set = X_test.merge(prediction_values,left_index = True, right_index = True)
predicted_test_set.reset_index(drop = True, inplace = True)
predicted_test_set.head()
#Sanity check that an ordinal converts back to a datetime.
#Fix: the original assigned the result to `dt`, clobbering the
#`import datetime as dt` module alias used elsewhere in this script.
example_date = datetime.fromordinal(736390)
example_date
#Change Date column from ordinal to datetime to plot.
#Fix: `date` is never imported at the top of the file, so the original map
#raised a NameError; import it locally.
from datetime import date
predicted_test_set['NewDate'] = predicted_test_set['DateOrdinal'].map(date.fromordinal)
predicted_test_set.head()
#Sort the df by publish date and reset the index
predicted_test_set.sort_values(by = 'NewDate',inplace = True)
predicted_test_set.reset_index(drop = True, inplace = True)
predicted_test_set.head()
#Set NewDate as the index
predicted_test_set1 = predicted_test_set.set_index('NewDate')
predicted_test_set1.head()
#NOTE(review): the plot below uses predicted_test_set (integer index), not the
#date-indexed predicted_test_set1 built just above — confirm which was intended
predicted_test_set[['Actual','Predicted']].plot(figsize = (10,6))
plt.title('Actual vs Predicted Closing Price values of FTSE 100 using Multiple Linear Regression')
Comments
Although the predicted values follow the same trend as the actual closing price values, the linear regression model does not capture the sudden drops of the price such as during the financial crisis. If the linear regression model was implemented to predict stock prices, heavy losses would have been suffered during the financial crisis.
#Fix: sklearn.ensemble.forest was a private module removed in scikit-learn 0.24;
#the supported public import path is sklearn.ensemble.
from sklearn.ensemble import RandomForestRegressor
#Code for Random Forest adapted from PythonData.com [36]
#oob_score=True reports out-of-bag R2 as an internal validation estimate
RF_Model = RandomForestRegressor(n_estimators = 100, oob_score = True)
#Fit the model
randomforest = RF_Model.fit(X_train,y_train)
rf_test_pred = RF_Model.predict(X_test)
rf_test_pred[1]
#compute RMSE and R2 statistics for the random forest predictions
rf_test_set_rmse = (np.sqrt(mean_squared_error(y_test,rf_test_pred)))
rf_test_set_r2 = r2_score(y_test,rf_test_pred)
print(rf_test_set_rmse)
print(rf_test_set_r2)
#Side-by-side actual vs predicted close prices for the random forest
rf_prediction_values = pd.DataFrame({'Actual':y_test,'Predicted':rf_test_pred})
rf_prediction_values.head()
#Merge predicted values to the X_test by index
rf_predicted_test_set = X_test.merge(rf_prediction_values,left_index = True, right_index = True)
rf_predicted_test_set.reset_index(drop = True, inplace = True)
rf_predicted_test_set.head()
#Change Date column from ordinal to datetime to plot.
#Fix: `date` is never imported at the top of the file, so the original map
#raised a NameError; import it locally.
from datetime import date
rf_predicted_test_set['NewDate'] = rf_predicted_test_set['DateOrdinal'].map(date.fromordinal)
rf_predicted_test_set.head()
#Sort the df by publish date and reset the index
rf_predicted_test_set.sort_values(by = 'NewDate',inplace = True)
rf_predicted_test_set.reset_index(drop = True, inplace = True)
rf_predicted_test_set.head()
rf_predicted_test_set[['Actual','Predicted']].plot(figsize = (10,6))
plt.title('Actual vs Predicted Closing Price values of FTSE 100 using Random Forest Regression')
Comments
The Random Forest Regression model follows the upward trend of the FTSE 100 close price, providing a better way to predict. The RF model has an R2 statistic of 0.99689, indicating that the model has likely overfitted.